Assignment 1. Perception Visualization
olive <- read.csv("olive.csv", sep = ",", header = TRUE)
library(ggplot2)
library(plotly)
library(MASS)
library(gridExtra)
p_vs_o <- ggplot(olive, aes(x=oleic, y=palmitic,
color = linolenic)) +
geom_point()
#xlab("")
p_vs_o

olive$linolenic2 <- cut_interval(olive$linolenic, n=4)
p_vs_o2 <- ggplot(olive, aes(x=oleic, y=palmitic, color = linolenic2)) +
geom_point()
p_vs_o2

pal_vs_ole_col <- ggplot(olive, aes(palmitic, oleic)) +
geom_point(aes(color=linolenic2))
pal_vs_ole_col

pal_vs_ole_size <- ggplot(olive, aes(palmitic, oleic)) +
geom_point(aes(size=linolenic2))
pal_vs_ole_size

pal_vs_ole_angle <- ggplot(olive, aes(palmitic, oleic)) +
geom_point() +
geom_spoke(aes(angle=as.numeric(linolenic2),radius=20))
pal_vs_ole_angle

o_vs_e <- ggplot(olive, aes(x=oleic, y=eicosenoic, color = Region)) +
geom_point()
o_vs_e

o_vs_e0 <- ggplot(olive, aes(x=oleic, y = eicosenoic, color = as.factor(Region)))+
geom_point()
o_vs_e0

olive$linoleic2 <- cut_interval(olive$linoleic, n=3)
olive$palmitic2 <- cut_interval(olive$palmitic, n=3)
olive$palmitoleic2 <- cut_interval(olive$palmitoleic, n=3)
o_vs_e1 <- ggplot(olive, aes(x=oleic, y=eicosenoic)) +
geom_point(aes(color=linoleic2, size=palmitoleic2, shape=palmitic2))
o_vs_e1

o_vs_e2 <- ggplot(olive, aes(x=oleic, y=eicosenoic)) +
geom_point(aes(color=Region, size=palmitoleic2, shape=palmitic2))
o_vs_e2

# Pie chart of the share of observations per Area. The axis lists switch off
# the grid, zero line and tick labels of the (unused) cartesian axes.
# Fix: the option is spelled `showticklabels`; the misspelled name was ignored.
plot_ly(data = olive, labels = ~Area, type = "pie", showlegend = FALSE) %>%
layout(title = "Proportion of Oils from different regions",
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
l_vs_e <- ggplot(olive, aes(x = linoleic, y = eicosenoic)) +
geom_point() +
geom_density_2d()#+
#stat_density_2d(aes(fill = stat(level)), geom = "polygon")
l_vs_e

Assignment 2: Multidimensional Scaling of a high-dimensional object
# reading the data
baseball <- read.csv("baseball-2016.csv", sep = ",", header = TRUE)
Question1
The xlsx file was converted to a csv file, which makes it easier to load the data without extra packages. It is reasonable to scale the data because the variables have very different ranges; scaling ensures that features with large scales do not dominate. For example, compare the range (maximum minus minimum) of “AB” with that of “Home runs per game”: the two are on totally different scales.
# range of AB
range1 <- max(baseball$AB) - min(baseball$AB)
range1
[1] 340
# range of HR per game
range2 <- max(baseball$HR.per.game) - min(baseball$HR.per.game)
range2
[1] 0.8039644
The density plot below shows graphically why scaling is needed. An appropriate density should be similar to a log-normal density.
plot(density(as.matrix(baseball[3:ncol(baseball)])))

Question 2
# scale the 26 numeric columns
baseball2 <- scale(baseball[ ,3:ncol(baseball)])
# Getting distance between points using Minkowski
mink_d <- dist(baseball2, method = "minkowski", p =2)
# get 2 column vector with fitted configuration
resid <- isoMDS(mink_d, k=2 )
initial value 19.856833
iter 5 value 16.319153
iter 10 value 16.046215
final value 15.935476
converged
# getting the coordinates
coords <- resid$points
# convert coords to dataframe
coordsMDS <- as.data.frame(coords)
# cbind to get column Team and League
coordsMDS2 <- cbind(baseball[ , 1:2], coordsMDS)
# plot the new dimension and color by League
plot_ly(coordsMDS2, x= ~V1, y = ~V2, type ="scatter", hovertext = ~ Team,
color = ~League, colors = c("#0444BF","#F46A4E"))
There appears to be a difference between the two leagues: most of the NL teams are in the second and third quadrants (negative V2), while most of the AL teams are in the first and fourth quadrants (positive V2). The y-axis “V2” gives the best differentiation between the leagues. The Boston Red Sox, Atlanta Braves and Philadelphia Phillies appear to be outliers.
# Shepard diagrams assess the goodness-of-fit of MDS techniques
# run shep on points from isoMDS and using same distance calculated in previous step
shep <- Shepard(mink_d, coords)
# convert the distance to numeric
delta <- as.numeric(mink_d)
# possibly a matrix of the coords
D <- as.numeric(dist(coords))
# create a square matrix with dimensions from the rows of coords
# n as rows
n = nrow(coords)
# index as empyt matrix
index <- matrix(1:n, nrow = n, ncol = n)
# get index of the lower triangle of the index matrix and convert it to numeric
index1 <- as.numeric(index[lower.tri(index)])
# same procedur
n = nrow(coords)
index=matrix(1:n, nrow=n, ncol=n, byrow = T)
index2=as.numeric(index[lower.tri(index)])
# plotly
plot_ly()%>%
add_markers(x=~delta, y=~D, hoverinfo = 'text',
text = ~paste('Obj1: ', rownames(baseball)[index1],
'<br> Obj 2: ', rownames(baseball)[index2])) #%>%
#if nonmetric MDS inolved
#add_lines(x=~sh$x, y=~sh$yf)
# get MDS variable V2 and cbind with points from isoMDS
d4 <- cbind(baseball[ ,1:2], coords[,2], baseball2[,])
# rename col3 to V2
colnames(d4)[3] <- c("V2")
# function to successively get plots of V2 vs other variables
myplot <- function(df, y_string){
  ggplot(df, aes_string(x = "V2", y = y_string, color = "League"))+
    geom_point()+
    geom_vline(xintercept = 0, color = "black")+
    geom_hline(yintercept = 0, color = "black")
}
# Variables that seem to have strongest positive relationship
# HR per game:
myplot(d4, "HR.per.game")
# X3B (strongest negative relationship):
myplot(d4, "X3B")
Per the plots, HR per game and HR produce the same scatter of points. These two have the strongest positive connection to V2, while X3B has the strongest negative connection to V2.
---
title: "Visualization Lab 2"
author: "Roshni Sundaramurthy (rossu809); Brian Masinde (brima748)"
date: "17 September 2018"
output:
  html_notebook:
    theme: journal
  html_document:
    df_print: paged
fontsize: 11pt
---

### Assignment 1. Perception Visualization

```{r data}
# Load the olive data; read.csv() already defaults to a comma separator
# and a header row.
olive <- read.csv(file = "olive.csv")
```

```{r libraries, message=FALSE, warning=FALSE, paged.print=FALSE}
library(ggplot2)
library(plotly)
library(MASS)
library(gridExtra)
```

```{r question1_a}
# Palmitic against oleic; point color follows the continuous linolenic value.
p_vs_o <- ggplot(data = olive, mapping = aes(x = oleic, y = palmitic)) +
  geom_point(mapping = aes(color = linolenic))
p_vs_o
```

```{r cut_interval}
# Discretize linolenic into 4 intervals with cut_interval().
olive[["linolenic2"]] <- cut_interval(olive[["linolenic"]], n = 4)
```


```{r question1_b}
# Same scatter as before, but colored by the discretized linolenic2 factor.
p_vs_o2 <- ggplot(data = olive, mapping = aes(x = oleic, y = palmitic)) +
  geom_point(mapping = aes(color = linolenic2))

p_vs_o2
```

```{r question 2_a, warning=FALSE}
# Oleic against palmitic with linolenic2 encoded as point color.
pal_vs_ole_col <- ggplot(
  data = olive,
  mapping = aes(x = palmitic, y = oleic, color = linolenic2)
) +
  geom_point()

pal_vs_ole_col
```


```{r question 2_b, warning = FALSE}
# Oleic against palmitic with linolenic2 encoded as point size.
pal_vs_ole_size <- ggplot(
  data = olive,
  mapping = aes(x = palmitic, y = oleic, size = linolenic2)
) +
  geom_point()

pal_vs_ole_size
```

```{r question 2_c, warning=FALSE}
# Oleic against palmitic with linolenic2 encoded as the orientation of a
# spoke: the factor's integer code is used as the angle, with a fixed
# radius of 20.
spoke_mapping <- aes(angle = as.numeric(linolenic2), radius = 20)
pal_vs_ole_angle <- ggplot(data = olive, mapping = aes(x = palmitic, y = oleic)) +
  geom_point() +
  geom_spoke(mapping = spoke_mapping)

pal_vs_ole_angle
```

```{r Question3}
# Eicosenoic against oleic, colored by Region exactly as stored in the data
# (no conversion to a factor).
o_vs_e <- ggplot(data = olive, mapping = aes(x = oleic, y = eicosenoic)) +
  geom_point(mapping = aes(color = Region))
o_vs_e
```

```{r question3_b}
# Same scatter, with Region converted to a factor so that color is discrete.
o_vs_e0 <- ggplot(data = olive, mapping = aes(x = oleic, y = eicosenoic)) +
  geom_point(mapping = aes(color = as.factor(Region)))

o_vs_e0
```


```{r discretization}
# Discretize linoleic, palmitic and palmitoleic into 3 intervals each; the
# results are stored in new columns named after the source with a "2" suffix.
olive[c("linoleic2", "palmitic2", "palmitoleic2")] <- lapply(
  olive[c("linoleic", "palmitic", "palmitoleic")],
  cut_interval,
  n = 3
)
```

```{r question4, warning=FALSE}
# Eicosenoic against oleic with three discretized variables mapped at once:
# color = linoleic2, size = palmitoleic2, shape = palmitic2.
o_vs_e1 <- ggplot(
  data = olive,
  mapping = aes(
    x = oleic, y = eicosenoic,
    color = linoleic2, size = palmitoleic2, shape = palmitic2
  )
) +
  geom_point()
o_vs_e1
```

```{r question5, warning=FALSE}
# Same as the previous plot, but color is mapped to Region instead of
# linoleic2.
o_vs_e2 <- ggplot(
  data = olive,
  mapping = aes(
    x = oleic, y = eicosenoic,
    color = Region, size = palmitoleic2, shape = palmitic2
  )
) +
  geom_point()

o_vs_e2
```

```{r question6}
# Pie chart of the share of observations per Area. The axis lists switch off
# the grid, zero line and tick labels of the (unused) cartesian axes.
# Fix: the option is spelled `showticklabels`; the misspelled name was ignored.
plot_ly(data = olive, labels = ~Area, type = "pie", showlegend = FALSE) %>%
  layout(title = "Proportion of Oils from different regions",
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

```

```{r question7}
# Eicosenoic against linoleic with 2-D density contour lines drawn over
# the points.
l_vs_e <- ggplot(data = olive, mapping = aes(x = linoleic, y = eicosenoic)) +
  geom_point() +
  geom_density_2d()
l_vs_e
```

### Assignment 2: Multidimensional Scaling of a high-dimensional object

```{r}
# Load the baseball data; read.csv() already defaults to a comma separator
# and a header row.
baseball <- read.csv(file = "baseball-2016.csv")
```

#### Question1
The xlsx file was converted to a csv file, which makes it easier to load the data without extra packages. It is reasonable to scale the data because the variables have very different ranges; scaling ensures that features with large scales do not dominate. For example, compare the range (maximum minus minimum) of "AB" with that of "Home runs per game": the two are on totally different scales.

```{r}
# Spread of AB: largest value minus smallest value.
range1 <- diff(range(baseball$AB))
range1
```

```{r}
# Spread of HR per game: largest value minus smallest value.
range2 <- diff(range(baseball$HR.per.game))

range2
```

The density plot below shows graphically why scaling is needed. An appropriate density should be similar to a log-normal density.
```{r base_density}
# Kernel density of all values from column 3 onwards, pooled into a single
# matrix, i.e. every non-identifier variable on its raw (unscaled) scale.
plot(density(as.matrix(baseball[3:ncol(baseball)])))
```


#### Question 2

```{r}
# Center and scale every column except the first two (the 26 numeric columns).
baseball2 <- scale(baseball[, -(1:2)])
```

```{r}
# Pairwise distances between the rows of the scaled data using the Minkowski
# metric with p = 2 (which is the Euclidean distance).
mink_d <- dist(baseball2, method = "minkowski", p =2)
```

```{r message=FALSE, warning=FALSE, paged.print=FALSE}
# Fit a two-dimensional MDS configuration to the distance object
resid <- isoMDS(mink_d, k = 2)

# fitted coordinates, one row per observation
coords <- resid[["points"]]

# the same coordinates as a data frame
coordsMDS <- as.data.frame(coords)

# put the first two columns of baseball (Team and League) in front
coordsMDS2 <- cbind(baseball[, 1:2], coordsMDS)
```

```{r message=FALSE, warning=FALSE, paged.print=FALSE}
# Plot the two MDS dimensions, one marker per team, colored by League and
# with the team name as hover text.
# `mode = "markers"` is stated explicitly so plotly does not have to guess
# the scatter mode (it emits a message when the mode is left unspecified).
plot_ly(coordsMDS2, x = ~V1, y = ~V2, type = "scatter", mode = "markers",
        hovertext = ~Team, color = ~League,
        colors = c("#0444BF", "#F46A4E"))
```

There appears to be a difference between the two leagues: most of the NL teams are in the second and third quadrants (negative V2), while most of the AL teams are in the first and fourth quadrants (positive V2). The y-axis "V2" gives the best differentiation between the leagues. The Boston Red Sox, Atlanta Braves and Philadelphia Phillies appear to be outliers.

```{r question3_2}
# Shepard diagrams assess the goodness-of-fit of MDS techniques.
# Run Shepard() on the points returned by isoMDS, using the same distance
# object (mink_d) that was passed to isoMDS in the previous step.
shep <- Shepard(mink_d, coords)
```

```{r}
# Original dissimilarities: flatten the dist object into a plain numeric
# vector (one entry per pair of observations).
delta <- as.numeric(mink_d)
```


```{r}
# Pairwise distances between the fitted MDS coordinates, flattened into a
# plain numeric vector (one entry per pair of observations).
D <- as.numeric(dist(coords))
```

```{r}
# Row position of each pair of observations.
# n: number of observations (rows of coords)
n <- nrow(coords)

# n x n matrix filled column-wise with 1..n, so entry [i, j] holds i
index <- matrix(seq_len(n), nrow = n, ncol = n)

# keep the entries below the diagonal, as a plain numeric vector
index1 <- as.numeric(index[lower.tri(index)])

```

```{r}
# Column position of each pair of observations: same procedure as for index1,
# but the matrix is filled row-wise, so entry [i, j] holds j.
n <- nrow(coords)

index <- matrix(seq_len(n), nrow = n, ncol = n, byrow = TRUE)

# keep the entries below the diagonal, as a plain numeric vector
index2 <- as.numeric(index[lower.tri(index)])
```

```{r}
# Shepard plot: original dissimilarities (delta) against the distances in the
# fitted configuration (D), one marker per pair of observations.
# The hover text names both members of the pair via index1 / index2. Team
# names are used because the row names of `baseball` are only row numbers.
# isoMDS() is a non-metric MDS, so the fitted curve returned by Shepard()
# (shep$x against shep$yf) is drawn on top of the markers.
plot_ly() %>%
  add_markers(x = ~delta, y = ~D, hoverinfo = "text",
              text = ~paste("Obj1: ", baseball$Team[index1],
                            "<br> Obj 2: ", baseball$Team[index2])) %>%
  add_lines(x = ~shep$x, y = ~shep$yf)
```


```{r question4_2}
# Team and League, then the second MDS coordinate (named V2 directly),
# then all scaled variables.
d4 <- cbind(baseball[, 1:2], V2 = coords[, 2], baseball2)
```

```{r fxn1, echo=FALSE, eval=FALSE}
# Consulted Naveen Gabriel for the function to produce successive scatter plots

# Draw column 3 of `dataset` against every later column, one panel per
# variable, colored by League, and arrange all panels in a single grid.
#
# dataset: data frame whose third column goes on the x-axis, whose columns
#          4..ncol(dataset) go on the y-axis of one panel each, and which
#          has a "League" column.
# Returns whatever gridExtra::grid.arrange() returns.
#
# The list of panels is local to the function, so this chunk no longer creates
# a global `myplot` list that collides with the `myplot()` function. Columns
# are looked up with the `.data` pronoun instead of the deprecated
# aes_string(); explicit labels keep the titles equal to the column names.
p <- function(dataset) {
  x_name <- colnames(dataset)[3]
  y_names <- colnames(dataset)[-(1:3)]

  panels <- lapply(y_names, function(y_name) {
    ggplot(dataset, aes(x = .data[[x_name]], y = .data[[y_name]],
                        color = .data[["League"]])) +
      geom_point() +
      labs(x = x_name, y = y_name, color = "League")
  })

  grid.arrange(grobs = panels)
}

p(d4)
```


```{r fxn2}
# Scatter plot of the MDS coordinate V2 (x-axis) against one other variable,
# colored by League, with black reference lines through x = 0 and y = 0.
#
# df:       data frame containing the columns "V2", "League" and `y_string`.
# y_string: name of the column to put on the y-axis, as a single string.
# Returns the ggplot object; stops with an error naming any missing column.
#
# Columns are looked up with the `.data` pronoun rather than the deprecated
# aes_string(), which parses its strings as R code and therefore fails on
# column names that are not syntactic R names.
myplot <- function(df, y_string) {
  stopifnot(is.data.frame(df), is.character(y_string), length(y_string) == 1L)

  absent <- setdiff(c("V2", "League", y_string), names(df))
  if (length(absent) > 0) {
    stop("myplot(): column(s) not found in df: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  ggplot(df, aes(x = .data[["V2"]], y = .data[[y_string]],
                 color = .data[["League"]])) +
    geom_point() +
    geom_vline(xintercept = 0, color = "black") +
    geom_hline(yintercept = 0, color = "black") +
    # explicit labels keep axis and legend titles equal to the column names
    labs(x = "V2", y = y_string, color = "League")
}
```

```{r}
# Variables that seem to have the strongest positive relationship with V2
# HR per game:
myplot(d4, "HR.per.game")
```

```{r}
# X3B: the variable named in the text below as having the strongest negative
# relationship with V2
myplot(d4, "X3B")
```

Per the plots, HR per game and HR produce the same scatter of points. These two have the strongest positive connection to V2, while X3B has the strongest negative connection to V2.

